# Computations
import numpy as np
import pandas as pd
# sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score,\
KFold, StratifiedShuffleSplit, ShuffleSplit, learning_curve
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this study, we analyze HR data available from kaggle.com. The dataset is fictional and was created by IBM data scientists.
# Location of the raw dataset; the standardized version used below shares
# the same base name with an '_STD.csv' suffix.
Path = 'Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx'
# Strip only the final extension (rsplit, not split) so that a dot anywhere
# earlier in the path cannot truncate the file name.
df = pd.read_csv(Path.rsplit(".", 1)[0] + '_STD.csv')
Target = 'Attrition'    # binary label: has the employee churned?
Labels = ['No', 'Yes']  # display names for the 0/1 classes
In the dataset, Attrition indicates whether an employee has churned. We would like to create a predictive model that predicts this feature.
# Columns excluded from the feature matrix: the target itself plus the
# 'Employee Number' identifier, which carries no predictive signal.
# (Renamed from the misspelled 'Aditional_Columns'; only used here.)
Additional_Columns = [Target, 'Employee Number']
X = df.drop(columns = Additional_Columns)
y = df[Target]
# Single-row heatmap: correlation of every feature with the target,
# with columns ordered by correlation strength (ascending).
fig, ax = plt.subplots(figsize=(17, 20))
Corr = pd.concat([X, df[Target]], axis=1).corr().round(2)
# Keep only the target's row, drop its self-correlation, then sort the
# remaining columns by their correlation with the target.
Corr = Corr.loc[[Target]].drop(columns=Target).T.sort_values(by=Target).T
_ = sns.heatmap(Corr, ax=ax, annot=True, square=True,
                cmap=sns.color_palette("Greens", n_colors=10),
                linewidths=0.8, vmin=0, vmax=1,
                annot_kws={"size": 12},
                cbar_kws={'label': Target + ' Correlation', "aspect": 40,
                          "shrink": .4, "orientation": "horizontal"})
_ = ax.set_yticklabels('')
del Corr
def Dist_Table(Inp, Target = Target):
    """Tabulate class counts and percentage shares of the target column."""
    counts = Inp[Target].value_counts()
    out = counts.to_frame('Count').reset_index(drop = False).rename(columns = {'index': Target})
    # Map the numeric classes 0/1 onto their human-readable labels.
    out[Target] = out[Target].replace({0: Labels[0], 1: Labels[1]})
    out['Percentage'] = (100 * out['Count'] / out['Count'].sum()).round(2)
    return out
Table = Dist_Table(Inp = df)
def Dist_Plot(Table, PieColors = ['SeaGreen', 'FireBrick'], TableColors = ['Navy','White']):
    """Render the class-distribution table (left) and donut chart (right).

    Parameters
    ----------
    Table : DataFrame
        Output of Dist_Table -- columns [Target, 'Count', 'Percentage'].
    PieColors : list
        Slice colors for the donut chart.
    TableColors : list
        (header fill, cell fill) colors for the table.

    NOTE: the mutable list defaults are shared across calls; harmless here
    since neither list is mutated.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "table"}, {"type": "pie"}]])
    # Right: donut chart; the minority class is pulled out for emphasis.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values, pull=[0, 0.1],
                         textfont=dict(size=16),
                         marker=dict(colors = PieColors, line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=.5)
    fig.update_layout(height = 400, legend=dict(orientation="v"), legend_title_text= Target)
    # Left: formatted table; percentages rendered with two decimals.
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
    Temp = [T[c].values for c in T.columns]
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = [0.4, 0.2, 0.2],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    # Fixed title: add the missing space and properly close the bold tag
    # (the original rendered '<b>AttritionDistribution<b>').
    fig.update_layout(title={'text': '<b>' + Target + ' Distribution</b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
Dist_Plot(Table)
StratifiedShuffleSplit, which we use below, combines shuffled random splitting with stratification: each train/test split contains approximately the same percentage of samples of each target class as the complete set.
# Hold out 30% of the data with a single stratified shuffle split so the
# train and test sets keep the same class proportions as the full set.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    # split() yields *positional* indices, so .iloc is the correct accessor;
    # the original's .loc / y[...] only worked because the default
    # RangeIndex happens to coincide with positions.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
del sss
# Side-by-side donuts: class balance of the target in the train vs. test split.
Colors = ['SeaGreen', 'FireBrick']
nc = 2
fig = make_subplots(rows=1, cols=nc, specs=[[{'type': 'domain'}] * nc])
# Add one donut per split; the minority class is pulled out for emphasis.
for col, (trace_name, counts) in enumerate([('Train Set', y_train.value_counts().values),
                                            ('Test Set', y_test.value_counts().values)], start=1):
    fig.add_trace(go.Pie(labels=Labels,
                         values=counts,
                         pull=[0, 0.1],
                         name=trace_name,
                         textfont=dict(size=16),
                         marker=dict(colors=Colors, line=dict(color='black', width=1))), 1, col)
fig.update_traces(hole=.5)
fig.update_layout(height = 400, legend=dict(orientation="v"),
                  legend_title_text= Target,
                  annotations=[dict(text= '<b>' + 'Train<br>Set' + '<b>', x=0.195, y=0.5, font_size=14, showarrow=False),
                               dict(text= '<b>' + 'Test<br>Set' + '<b>', x=0.8, y=0.5, font_size=14, showarrow=False)],
                  title={'text': '<b>' + Target + '<b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
In this article, we implement scikit-learn's GaussianNB function which implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be \begin{align} P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma^2_y}} \exp\left(-\frac{(x_i - \mu_y)^2}{2\sigma^2_y}\right) \end{align} The parameters $\sigma_y$ and $\mu_y$ are estimated using maximum likelihood.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text on a colored banner, padded with '=' up to width L.

    C is the background/rule color name, T the text color name.
    """
    back_map = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    fore_map = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = back_map[C] + fore_map[T] + Style.NORMAL + Text + Style.RESET_ALL
    # Pad with '=' so banner + space + rule spans L characters.
    rule = fore_map[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in color C."""
    fore_map = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(fore_map[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Best_Parm(model, param_dist, Top = None,
              X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    """Randomized hyper-parameter search scored on precision (20-fold CV).

    Fits a RandomizedSearchCV over `param_dist`, displays the best score,
    best parameters and held-out precision, shows the ranked results table
    (top `Top` rows; all rows when Top is None), plots score vs. fit time
    via Grid_Performance_Plot, and returns the fitted search object.

    NOTE: the data defaults are bound at definition time to the module-level
    train/test split.
    """
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = KFold(n_splits = 20, shuffle = True),
                              n_iter = int(1e3),
                              scoring = 'precision',
                              error_score = 0,
                              verbose = 0,
                              n_jobs = 10,
                              return_train_score = True)
    _ = grid.fit(X_train, y_train)
    # 'Best Parameters' -- fixes the 'Paramerers' typo in the displayed header.
    display(pd.DataFrame({'Best Score': [grid.best_score_],
                          'Best Parameters': [str(grid.best_params_)],
                          'Precision': [grid.score(X_test, y_test)]}).round(4).style.hide_index().set_precision(4))
    Table = Grid_Table(grid)
    if Top is None:  # idiomatic None check (was '== None')
        Top = Table.shape[0]
    display(Table.reset_index(drop = False).head(Top).style.hide_index().\
            set_precision(4).background_gradient(subset= ['mean_test_score'], cmap='Greens').\
            background_gradient(subset= ['mean_fit_time'], cmap='Oranges'))
    Grid_Performance_Plot(Table)
    return grid
def Grid_Table(grid):
    """Condense grid.cv_results_ into a rank-indexed summary table.

    Parameter dicts are rendered as plain 'key: value' text; rows are
    sorted by rank and rounded to four decimals.
    """
    results = grid.cv_results_
    # "{'a': 1}" -> "a: 1": strip braces and quotes for readability.
    params = [str(p).replace('{', '').replace('}', '').replace("'", '')
              for p in results['params']]
    out = pd.DataFrame({'rank_test_score': results['rank_test_score'],
                        'params': params,
                        'mean_test_score': results['mean_test_score'],
                        'mean_fit_time': results['mean_fit_time']})
    return out.round(4).sort_values('rank_test_score').set_index('rank_test_score')
def Grid_Performance_Plot(Table):
    """Plot mean test score and mean fit time per parameter combination.

    Expects the Grid_Table output: columns 'params', 'mean_test_score',
    'mean_fit_time'.
    """
    font = FontProperties()
    font.set_weight('bold')
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    Z = zip(axes, ['mean_test_score', 'mean_fit_time'], ['Blue', 'Red'],
            ['Classification Accuracy', 'Fit Time (with caching)'])
    for ax, col, c, title in Z:
        # NOTE(review): yerr equals the plotted value itself -- presumably a
        # deliberate presentation choice; kept as-is.
        _ = ax.errorbar(x = Table['params'], y = Table[col], yerr = Table[col], color = c)
        _ = ax.set_xticklabels(labels = Table['params'], rotation=90, fontsize = 10)
        _ = ax.set_ylim(bottom = 0)
        _ = ax.set_xlabel('Parameters')  # fixed 'Paramerers' typo
        _ = ax.set_title(title, fontproperties=font, fontsize = 14)
def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10):
    """Cross-validate `model` over stratified shuffle splits.

    For each of `n_splits` splits the model is refit and scored on both the
    train and the test fold.

    Returns
    -------
    Reports_Train, Reports_Test : DataFrame
        Classification reports with each cell formatted 'mean ± std'
        across the splits.
    CM_Train, CM_Test : ndarray
        Confusion matrices averaged over the splits (rounded to int).

    Relies on the module-level Test_Size and Labels; data defaults are
    bound at definition time.
    """

    def _summarize(report_values, cms, template):
        """Aggregate per-split report arrays / confusion matrices into a
        'mean ± std' DataFrame and an averaged integer confusion matrix."""
        # np.stack keeps a 2-D (n_splits, n_cells) array even when
        # n_splits == 1 (the original 1-D vstack seed broke in that case).
        stacked = np.stack([r.ravel() for r in report_values])
        Mean = pd.DataFrame(stacked.mean(axis = 0).reshape(template.shape),
                            index = template.index, columns = template.columns)
        STD = pd.DataFrame(stacked.std(axis = 0).reshape(template.shape),
                           index = template.index, columns = template.columns)
        report = Mean.applymap(lambda x: ('%.4f' % x)) + ' ± ' + STD.applymap(lambda x: ('%.4f' % x))
        cm = np.stack([c.ravel() for c in cms]).mean(axis = 0).reshape(cms[0].shape).round(0).astype(int)
        return report, cm

    sss = StratifiedShuffleSplit(n_splits = n_splits, test_size = Test_Size, random_state = 42)
    # Work with plain arrays so positional indexing below is unambiguous.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    _ = sss.get_n_splits(X, y)
    Reports_Train, Reports_Test, CM_Train, CM_Test = [], [], [], []
    R = None  # template report; same shape/index/columns on every split
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        _ = model.fit(X_train, y_train)
        # Train fold
        y_pred = model.predict(X_train)
        R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Train.append(R.values)
        CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
        # Test fold
        y_pred = model.predict(X_test)
        R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Test.append(R.values)
        CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
    # The duplicated train/test aggregation of the original is factored out.
    Reports_Train, CM_Train = _summarize(Reports_Train, CM_Train, R)
    Reports_Test, CM_Test = _summarize(Reports_Test, CM_Test, R)
    Reports_Train = Reports_Train.reset_index().rename(columns = {'index': 'Train Set (CV = % i)' % n_splits})
    Reports_Test = Reports_Test.reset_index().rename(columns = {'index': 'Test Set (CV = % i)' % n_splits})
    return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, n_splits = 10):
    """Draw raw and row-normalized confusion matrices for train and test."""
    bold = FontProperties()
    bold.set_weight('bold')
    panels = [('Train Set (CV = % i)' % n_splits, CM_Train),
              ('Test Set (CV = % i)' % n_splits, CM_Test)]
    for title, cm in panels:
        fig, ax = plt.subplots(1, 2, figsize=(12, 4))
        fig.suptitle(title, fontproperties=bold, fontsize = 16)
        # Left: raw counts.
        _ = sns.heatmap(cm, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": 1})
        _ = ax[0].set_title('Confusion Matrix')
        # Right: each row divided by its true-class total.
        normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        _ = sns.heatmap(normalized, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
                        linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(Labels)
            _ = a.yaxis.set_ticklabels(Labels)
            _ = a.set_aspect(1)
Some of the metrics that we use here to measure performance: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}, \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [6] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# --- Baseline: Gaussian Naive-Bayes with its default hyper-parameters ---
Header('Gaussian Naive-Bayes with Default Parameters')
n_splits = 20
GNB = GaussianNB()
print('Default Parameters = %s' % GNB.get_params(deep=True))
_ = GNB.fit(X_train, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GNB, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, n_splits = n_splits)

def _cm_metrics(cm):
    """Precision, recall, TPR, TNR and balanced accuracy from a 2x2
    confusion matrix laid out sklearn-style: [[tn, fp], [fn, tp]]."""
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    tpr = tp / (tp + fn)  # recall and TPR are the same quantity
    tnr = tn / (tn + fp)
    return precision, recall, tpr, tnr, (tpr + tnr) / 2

# The original computed these metrics twice with copy-pasted code; the
# helper above replaces both copies. An unused PPCR variable was dropped.
Header('Train Set', C = 'Green')
Precision, Recall, TPR, TNR, BA = _cm_metrics(CM_Train)
print('Precision (Train) = %.2f' % Precision)
print('Recall (Train) = %.2f' % Recall)
print('TPR (Train) = %.2f' % TPR)
print('TNR (Train) = %.2f' % TNR)
print('Balanced Accuracy (Train) = %.2f' % BA)
Header('Test Set')
Precision, Recall, TPR, TNR, BA = _cm_metrics(CM_Test)
print('Precision (Test) = %.2f' % Precision)
print('Recall (Test) = %.2f' % Recall)
print('TPR (Test) = %.2f' % TPR)
print('TNR (Test) = %.2f' % TNR)
print('Balanced Accuracy (Test) = %.2f' % BA)
del Precision, Recall, TPR, TNR, BA
Line()
Gaussian Naive-Bayes with Default Parameters ======================================================= Default Parameters = {'priors': None, 'var_smoothing': 1e-09}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9165 ± 0.0060 | 0.8729 ± 0.0306 | 0.8938 ± 0.0143 | 863.0000 ± 0.0000 |
| Yes | 0.4768 ± 0.0474 | 0.5855 ± 0.0428 | 0.5224 ± 0.0198 | 166.0000 ± 0.0000 |
| accuracy | 0.8266 ± 0.0201 | 0.8266 ± 0.0201 | 0.8266 ± 0.0201 | 0.8266 ± 0.0201 |
| macro avg | 0.6967 ± 0.0224 | 0.7292 ± 0.0116 | 0.7081 ± 0.0160 | 1029.0000 ± 0.0000 |
| weighted avg | 0.8456 ± 0.0067 | 0.8266 ± 0.0201 | 0.8339 ± 0.0146 | 1029.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9112 ± 0.0086 | 0.8658 ± 0.0305 | 0.8876 ± 0.0171 | 370.0000 ± 0.0000 |
| Yes | 0.4510 ± 0.0561 | 0.5599 ± 0.0477 | 0.4970 ± 0.0410 | 71.0000 ± 0.0000 |
| accuracy | 0.8166 ± 0.0251 | 0.8166 ± 0.0251 | 0.8166 ± 0.0251 | 0.8166 ± 0.0251 |
| macro avg | 0.6811 ± 0.0301 | 0.7128 ± 0.0254 | 0.6923 ± 0.0280 | 441.0000 ± 0.0000 |
| weighted avg | 0.8371 ± 0.0138 | 0.8166 ± 0.0251 | 0.8247 ± 0.0201 | 441.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.47 Recall (Train) = 0.58 TPR (Train) = 0.58 TNR (Train) = 0.87 Balanced Accuracy (Train) = 0.73 Test Set =========================================================================================== Precision (Test) = 0.44 Recall (Test) = 0.56 TPR (Test) = 0.56 TNR (Test) = 0.86 Balanced Accuracy (Test) = 0.71 ====================================================================================================
In order to find the best parameters for our model, we can use RandomizedSearchCV. Here, we have defined a function Best_Parm to find the best parameters.
# Candidate priors: a sweep of fixed class weights (0.25/0.75, 0.50/0.50,
# 0.75/0.25) plus the empirical class frequencies observed in y.
Priors = [np.array([p, 1 - p]) for p in np.arange(.25, 1, .25)]
Priors.append((y.value_counts().values / y.count()).round(2))
# var_smoothing swept over 1e-1, 1e-4, 1e-7, 1e-10.
param_dist = {'priors': Priors, 'var_smoothing': [10**(-k) for k in range(1, 11, 3)]}
Header('Gaussian Naive Bayes with the Best Parameters')
grid = Best_Parm(model = GNB, param_dist = param_dist)
Gaussian Naive Bayes with the Best Parameters ======================================================
| Best Score | Best Paramerers | Precision |
|---|---|---|
| 0.4351 | {'var_smoothing': 0.1, 'priors': array([0.84, 0.16])} | 0.4607 |
| rank_test_score | params | mean_test_score | mean_fit_time |
|---|---|---|---|
| 1 | var_smoothing: 0.1, priors: array([0.84, 0.16]) | 0.4351 | 0.0023 |
| 2 | var_smoothing: 0.0001, priors: array([0.84, 0.16]) | 0.4013 | 0.0020 |
| 2 | var_smoothing: 1e-07, priors: array([0.84, 0.16]) | 0.4013 | 0.0024 |
| 2 | var_smoothing: 1e-10, priors: array([0.84, 0.16]) | 0.4013 | 0.0022 |
| 5 | var_smoothing: 0.1, priors: array([0.75, 0.25]) | 0.3497 | 0.0020 |
| 6 | var_smoothing: 0.0001, priors: array([0.75, 0.25]) | 0.3356 | 0.0020 |
| 6 | var_smoothing: 1e-07, priors: array([0.75, 0.25]) | 0.3356 | 0.0018 |
| 6 | var_smoothing: 1e-10, priors: array([0.75, 0.25]) | 0.3356 | 0.0020 |
| 9 | var_smoothing: 0.1, priors: array([0.5, 0.5]) | 0.2741 | 0.0022 |
| 10 | var_smoothing: 0.0001, priors: array([0.5, 0.5]) | 0.2696 | 0.0021 |
| 10 | var_smoothing: 1e-07, priors: array([0.5, 0.5]) | 0.2696 | 0.0020 |
| 10 | var_smoothing: 1e-10, priors: array([0.5, 0.5]) | 0.2696 | 0.0021 |
| 13 | var_smoothing: 0.1, priors: array([0.25, 0.75]) | 0.2219 | 0.0025 |
| 14 | var_smoothing: 0.0001, priors: array([0.25, 0.75]) | 0.2218 | 0.0025 |
| 14 | var_smoothing: 1e-07, priors: array([0.25, 0.75]) | 0.2218 | 0.0023 |
| 14 | var_smoothing: 1e-10, priors: array([0.25, 0.75]) | 0.2218 | 0.0022 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
# --- Refit with the hyper-parameters found by the randomized search ---
Header('Gaussian Naive Bayes with the Best Parameters')
GNB = GaussianNB(**grid.best_params_)
# Fixed label: these are the tuned parameters, not the defaults.
print('Parameters = %s' % GNB.get_params(deep=True))
_ = GNB.fit(X_train, y_train)
# Use the in-scope n_splits (20) instead of repeating the literal, keeping
# the CV count consistent with the column names styled below.
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GNB, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, n_splits = n_splits)

def _cm_metrics(cm):
    """Precision, recall, TPR, TNR and balanced accuracy from a 2x2
    confusion matrix laid out sklearn-style: [[tn, fp], [fn, tp]]."""
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    tpr = tp / (tp + fn)  # recall and TPR are the same quantity
    tnr = tn / (tn + fp)
    return precision, recall, tpr, tnr, (tpr + tnr) / 2

# The original computed these metrics twice with copy-pasted code; the
# helper above replaces both copies. An unused PPCR variable was dropped.
Header('Train Set', C = 'Green')
Precision, Recall, TPR, TNR, BA = _cm_metrics(CM_Train)
print('Precision (Train) = %.2f' % Precision)
print('Recall (Train) = %.2f' % Recall)
print('TPR (Train) = %.2f' % TPR)
print('TNR (Train) = %.2f' % TNR)
print('Balanced Accuracy (Train) = %.2f' % BA)
Header('Test Set')
Precision, Recall, TPR, TNR, BA = _cm_metrics(CM_Test)
print('Precision (Test) = %.2f' % Precision)
print('Recall (Test) = %.2f' % Recall)
print('TPR (Test) = %.2f' % TPR)
print('TNR (Test) = %.2f' % TNR)
print('Balanced Accuracy (Test) = %.2f' % BA)
del Precision, Recall, TPR, TNR, BA
Line()
Gaussian Naive Bayes with the Best Parameters ====================================================== Default Parameters = {'priors': array([0.84, 0.16]), 'var_smoothing': 0.1}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9109 ± 0.0075 | 0.9012 ± 0.0302 | 0.9057 ± 0.0130 | 863.0000 ± 0.0000 |
| Yes | 0.5230 ± 0.0583 | 0.5404 ± 0.0529 | 0.5267 ± 0.0205 | 166.0000 ± 0.0000 |
| accuracy | 0.8430 ± 0.0186 | 0.8430 ± 0.0186 | 0.8430 ± 0.0186 | 0.8430 ± 0.0186 |
| macro avg | 0.7170 ± 0.0273 | 0.7208 ± 0.0158 | 0.7162 ± 0.0149 | 1029.0000 ± 0.0000 |
| weighted avg | 0.8483 ± 0.0081 | 0.8430 ± 0.0186 | 0.8445 ± 0.0130 | 1029.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9038 ± 0.0079 | 0.8923 ± 0.0310 | 0.8977 ± 0.0168 | 370.0000 ± 0.0000 |
| Yes | 0.4842 ± 0.0760 | 0.5049 ± 0.0461 | 0.4906 ± 0.0455 | 71.0000 ± 0.0000 |
| accuracy | 0.8299 ± 0.0254 | 0.8299 ± 0.0254 | 0.8299 ± 0.0254 | 0.8299 ± 0.0254 |
| macro avg | 0.6940 ± 0.0398 | 0.6986 ± 0.0246 | 0.6942 ± 0.0300 | 441.0000 ± 0.0000 |
| weighted avg | 0.8362 ± 0.0161 | 0.8299 ± 0.0254 | 0.8322 ± 0.0205 | 441.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.51 Recall (Train) = 0.54 TPR (Train) = 0.54 TNR (Train) = 0.90 Balanced Accuracy (Train) = 0.72 Test Set =========================================================================================== Precision (Test) = 0.47 Recall (Test) = 0.51 TPR (Test) = 0.51 TNR (Test) = 0.89 Balanced Accuracy (Test) = 0.70 ====================================================================================================
As can be seen, choosing the best parameters didn't improve our performance results.